{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "from catboost import *\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import StratifiedKFold,train_test_split,GridSearchCV\n",
    "from sklearn.metrics import log_loss"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This solution was prepared by SAI Krithik (rank 8) and Karan Juneja (rank 6). The code runs on Kaggle and Colab."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the train and test tables from the working directory (train first, then test).\n",
    "df_train, df_test = (pd.read_csv(csv_name) for csv_name in (\"train.csv\", \"test.csv\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_engineered_features(frame):\n",
    "    \"\"\"Return a copy of `frame` with the engineered feature columns appended.\n",
    "\n",
    "    Added columns, always in this order:\n",
    "      * Hydro_fire, Hydro_Road, Road_fire: pairwise sums of the horizontal distances\n",
    "        to hydrology, roadways and fire points.\n",
    "      * Hydro_Road_sub, Hydro_fire_sub, Road_fire_sub: absolute pairwise differences\n",
    "        of the same distances.\n",
    "      * RAD_SLOPE, RAD_Aspect: slope and aspect converted from degrees to radians.\n",
    "      * EL_DIS, EL_Fire, EL_Road: elevation minus 0.2 times each horizontal distance.\n",
    "\n",
    "    The same function is applied to the train and test frames so that both end up with\n",
    "    identical column order (the hand-written test block created Hydro_fire_sub before\n",
    "    Hydro_Road_sub, the opposite of the train block).\n",
    "    \"\"\"\n",
    "    hydro = frame['Horizontal_Distance_To_Hydrology(meters)']\n",
    "    road = frame['Horizontal_Distance_To_Roadways(meters)']\n",
    "    fire = frame['Horizontal_Distance_To_Fire_Points(meters)']\n",
    "    elevation = frame['Elevation(meters)']\n",
    "    deg_to_rad = np.pi/180\n",
    "    # Vectorised arithmetic replaces the element-wise .apply(lambda ...) calls.\n",
    "    return frame.assign(\n",
    "        Hydro_fire=fire + hydro,\n",
    "        Hydro_Road=road + hydro,\n",
    "        Road_fire=fire + road,\n",
    "        Hydro_Road_sub=np.abs(road - hydro),\n",
    "        Hydro_fire_sub=np.abs(fire - hydro),\n",
    "        Road_fire_sub=np.abs(fire - road),\n",
    "        RAD_SLOPE=frame['Slope(degrees)']*deg_to_rad,\n",
    "        RAD_Aspect=frame['Aspect(degrees)']*deg_to_rad,\n",
    "        EL_DIS=elevation - hydro*0.2,\n",
    "        EL_Fire=elevation - fire*0.2,\n",
    "        EL_Road=elevation - road*0.2,\n",
    "    )\n",
    "\n",
    "\n",
    "df_train = add_engineered_features(df_train)\n",
    "df_test = add_engineered_features(df_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_train['Ele_hydro_add'] = df_train['Elevation(meters)'] + df_train['Vertical_Distance_To_Hydrology(meters)']\n",
    "# df_train['Ele_hydro_sub'] = df_train['Elevation(meters)'] - df_train['Vertical_Distance_To_Hydrology(meters)']\n",
    "# df_test['Ele_hydro_add'] = df_test['Elevation(meters)'] + df_test['Vertical_Distance_To_Hydrology(meters)']\n",
    "# df_test['Ele_hydro_sub'] = df_test['Elevation(meters)'] - df_test['Vertical_Distance_To_Hydrology(meters)']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_train['Hill_9am_diff'] = df_train['Hillshade_Noon'] - df_train['Hillshade_9am']\n",
    "# df_train['Hill_Noon_diff'] = df_train['Hillshade_Noon'] - df_train['Hillshade_3pm']\n",
    "# df_train['Hillshade_9am_3pm']= df_train['Hillshade_9am'] - df_train['Hillshade_3pm']\n",
    "\n",
    "# df_test['Hill_9am_diff'] = df_test['Hillshade_Noon'] - df_test['Hillshade_9am']\n",
    "# df_test['Hill_Noon_diff'] = df_test['Hillshade_Noon'] - df_test['Hillshade_3pm']\n",
    "# df_test['Hillshade_9am_3pm']= df_test['Hillshade_9am'] - df_test['Hillshade_3pm']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_train['Hyp_Elv_Hydro'] = np.sqrt((df_train['Horizontal_Distance_To_Hydrology(meters)'])**2 + (df_train['Elevation(meters)'])**2)\n",
    "# df_train['Hyp_Elv_Fire'] = np.sqrt((df_train['Horizontal_Distance_To_Fire_Points(meters)'])**2 + (df_train['Elevation(meters)'])**2)\n",
    "# df_train['Hyp_Elv_Road'] = np.sqrt((df_train['Horizontal_Distance_To_Roadways(meters)'])**2 + (df_train['Elevation(meters)'])**2)\n",
    "# df_test['Hyp_Elv_Hydro'] = np.sqrt((df_test['Horizontal_Distance_To_Hydrology(meters)'])**2 + (df_test['Elevation(meters)'])**2)\n",
    "# df_test['Hyp_Elv_Fire'] = np.sqrt((df_test['Horizontal_Distance_To_Fire_Points(meters)'])**2 + (df_test['Elevation(meters)'])**2)\n",
    "# df_test['Hyp_Elv_Road'] = np.sqrt((df_test['Horizontal_Distance_To_Roadways(meters)'])**2 + (df_test['Elevation(meters)'])**2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the training frame into the target (Cover_Type) and everything else.\n",
    "y_train = df_train['Cover_Type']\n",
    "X_train = df_train.drop(columns=['Cover_Type'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5    0.487608\n",
       "7    0.364613\n",
       "6    0.061549\n",
       "1    0.035284\n",
       "3    0.029880\n",
       "2    0.016351\n",
       "4    0.004716\n",
       "Name: Cover_Type, dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Relative frequency of each Cover_Type label in the training target.\n",
    "y_train.value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extra LightGBM keyword arguments for GPU training. This name was never defined in the\n",
    "# notebook, so the cell raised NameError on a fresh kernel; the empty default trains on CPU.\n",
    "# NOTE(review): set e.g. {'device': 'gpu'} when a GPU build of LightGBM is available -- confirm.\n",
    "params_gpu = {}\n",
    "\n",
    "splits = 5\n",
    "folds = StratifiedKFold(n_splits=splits, shuffle=True, random_state=22)\n",
    "\n",
    "# Select the test features by name so the model sees the same columns, in the same order,\n",
    "# as the frame it is fitted on.\n",
    "X_test = df_test[X_train.columns]\n",
    "n_classes = y_train.nunique()\n",
    "\n",
    "# NOTE: despite its name, `oof_preds` accumulates the class probabilities predicted for the\n",
    "# TEST rows, averaged over the folds. The name is kept because later cells read it.\n",
    "oof_preds = np.zeros((len(X_test), n_classes))\n",
    "final_preds = []  # validation log loss of each fold\n",
    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train.values, y_train.values)):\n",
    "    print(\"Fold {}\".format(fold_))\n",
    "    X_trn, y_trn = X_train.iloc[trn_idx], y_train.iloc[trn_idx]\n",
    "    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]\n",
    "    # NOTE(review): per the LightGBM parameter docs bagging_fraction only takes effect when\n",
    "    # bagging_freq > 0, and none is set here -- confirm the intent.\n",
    "    clf = LGBMClassifier(random_state=22, n_jobs=-1, max_depth=-1, min_data_in_leaf=17, num_leaves=67,\n",
    "                         colsample_bytree=0.9, bagging_fraction=0.1, lambda_l2=1.1, n_estimators=5000,\n",
    "                         **params_gpu)\n",
    "    clf.fit(X_trn, y_trn, eval_metric=\"logloss\", eval_set=[(X_val, y_val)], verbose=True,\n",
    "            early_stopping_rounds=100)\n",
    "    # labels= keeps the probability columns matched to the classes the model was fitted on.\n",
    "    final_preds.append(log_loss(y_true=y_val, y_pred=clf.predict_proba(X_val), labels=clf.classes_))\n",
    "    oof_preds += clf.predict_proba(X_test)\n",
    "\n",
    "oof_preds = oof_preds/splits\n",
    "# Mean validation log loss across the folds.\n",
    "print(sum(final_preds)/splits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `y_test` is not defined anywhere in this notebook, so the bare call raised NameError on a\n",
    "# fresh run. The averaged test predictions are scored only when such labels exist.\n",
    "if 'y_test' in globals():\n",
    "    print(log_loss(y_pred=oof_preds,y_true=y_test))\n",
    "else:\n",
    "    print('y_test is not defined; skipping the test-set log loss')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With 10 folds and the features above, this gives a score of about 0.25xx on the leaderboard; with XGBoost you might get 0.23xx."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Pseudo Labelling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def checkconf(z, threshhold=0.99):\n",
    "    \"\"\"Return a pseudo-label for one row of class probabilities, or NaN if not confident.\n",
    "\n",
    "    z: sequence of class probabilities for a single row.\n",
    "    threshhold: the largest probability must be strictly greater than this value for the\n",
    "        row to be labelled (default 0.99, the value previously hard-coded).\n",
    "\n",
    "    Returns the position of the largest probability plus one, or np.nan otherwise.\n",
    "    NOTE(review): the +1 assumes probability column k belongs to label k+1 -- confirm\n",
    "    against the classes of the model that produced the probabilities.\n",
    "    \"\"\"\n",
    "    if max(z) > threshhold:\n",
    "        return np.argmax(z)+1\n",
    "    return np.nan\n",
    "\n",
    "\n",
    "# One pseudo-label (or NaN) per row of the averaged test predictions.\n",
    "list2 = [checkconf(row) for row in oof_preds]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We did the pseudo labelling with a stratification approach, to keep the distribution the same across the training dataset.\n",
    "It worked amazingly well with CatBoost, but it still needs to be refined further."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the pseudo-labels to a copy of the test frame and keep only the rows that received\n",
    "# one. df_test itself is left without a Cover_Type column, so it can still be passed to\n",
    "# predict_proba and this cell can be re-run safely.\n",
    "temp = (\n",
    "    df_test.assign(Cover_Type=list2)\n",
    "    .dropna(subset=['Cover_Type'])       # drop only rows without a confident label\n",
    "    .astype({'Cover_Type': 'int64'})     # NaNs forced the column to float; restore integer labels\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the target back onto df_train (the assignment aligns y_train to df_train by index).\n",
    "# NOTE(review): df_train was never stripped of Cover_Type (the earlier drop() returned a new\n",
    "# frame), so this looks redundant -- confirm.\n",
    "df_train['Cover_Type'] = y_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the labelled training rows on top of the pseudo-labelled test rows.\n",
    "# ignore_index=True gives the combined frame a fresh, unique index instead of carrying over\n",
    "# the row labels of both inputs, which can repeat.\n",
    "temp_train = pd.concat([df_train,temp], ignore_index=True)\n",
    "# From here on X_train / y_train refer to the enlarged, pseudo-labelled training set.\n",
    "X_train = temp_train.drop('Cover_Type',axis=1)\n",
    "y_train = temp_train['Cover_Type']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positions of the feature columns with fewer than 5 distinct values.\n",
    "# nunique() is computed once (it used to be recomputed over the whole frame for every\n",
    "# column) and read positionally, instead of indexing a string-labelled Series with integers.\n",
    "# NOTE(review): as in the original loop, the last column is not examined -- confirm intent.\n",
    "distinct_counts = X_train.nunique().values\n",
    "catlist = [pos for pos, n_distinct in enumerate(distinct_counts[:-1]) if n_distinct < 5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To make the solution as reproducible as possible, I created a pickled file with the new labels: predictions change a bit when using a GPU, so the final predictions change too."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Load the saved frame with the pseudo-labels.\n",
    "# NOTE: pickle.load can execute arbitrary code -- only open pickle files from a trusted source.\n",
    "with open('psuedo_label_Train (1).pkl','rb') as pkl_file:\n",
    "    pseudo_train_final = pickle.load(pkl_file)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Elevation(meters)</th>\n",
       "      <th>Aspect(degrees)</th>\n",
       "      <th>Slope(degrees)</th>\n",
       "      <th>Horizontal_Distance_To_Hydrology(meters)</th>\n",
       "      <th>Vertical_Distance_To_Hydrology(meters)</th>\n",
       "      <th>Horizontal_Distance_To_Roadways(meters)</th>\n",
       "      <th>Hillshade_9am</th>\n",
       "      <th>Hillshade_Noon</th>\n",
       "      <th>Hillshade_3pm</th>\n",
       "      <th>Horizontal_Distance_To_Fire_Points(meters)</th>\n",
       "      <th>...</th>\n",
       "      <th>Hydro_Road</th>\n",
       "      <th>Road_fire</th>\n",
       "      <th>Hydro_Road_sub</th>\n",
       "      <th>Hydro_fire_sub</th>\n",
       "      <th>Road_fire_sub</th>\n",
       "      <th>RAD_SLOPE</th>\n",
       "      <th>RAD_Aspect</th>\n",
       "      <th>EL_DIS</th>\n",
       "      <th>EL_Fire</th>\n",
       "      <th>EL_Road</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>80378</th>\n",
       "      <td>2893</td>\n",
       "      <td>27</td>\n",
       "      <td>17</td>\n",
       "      <td>42</td>\n",
       "      <td>-10</td>\n",
       "      <td>4377</td>\n",
       "      <td>211</td>\n",
       "      <td>202</td>\n",
       "      <td>123</td>\n",
       "      <td>3914</td>\n",
       "      <td>...</td>\n",
       "      <td>4419</td>\n",
       "      <td>8291</td>\n",
       "      <td>4335</td>\n",
       "      <td>3872</td>\n",
       "      <td>463</td>\n",
       "      <td>0.296706</td>\n",
       "      <td>0.471239</td>\n",
       "      <td>2884.6</td>\n",
       "      <td>2110.2</td>\n",
       "      <td>2017.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>344116</th>\n",
       "      <td>3367</td>\n",
       "      <td>259</td>\n",
       "      <td>21</td>\n",
       "      <td>201</td>\n",
       "      <td>57</td>\n",
       "      <td>3792</td>\n",
       "      <td>164</td>\n",
       "      <td>246</td>\n",
       "      <td>218</td>\n",
       "      <td>902</td>\n",
       "      <td>...</td>\n",
       "      <td>3993</td>\n",
       "      <td>4694</td>\n",
       "      <td>3591</td>\n",
       "      <td>701</td>\n",
       "      <td>2890</td>\n",
       "      <td>0.366519</td>\n",
       "      <td>4.520403</td>\n",
       "      <td>3326.8</td>\n",
       "      <td>3186.6</td>\n",
       "      <td>2608.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>407990</th>\n",
       "      <td>3323</td>\n",
       "      <td>170</td>\n",
       "      <td>12</td>\n",
       "      <td>531</td>\n",
       "      <td>103</td>\n",
       "      <td>1326</td>\n",
       "      <td>229</td>\n",
       "      <td>245</td>\n",
       "      <td>147</td>\n",
       "      <td>2136</td>\n",
       "      <td>...</td>\n",
       "      <td>1857</td>\n",
       "      <td>3462</td>\n",
       "      <td>795</td>\n",
       "      <td>1605</td>\n",
       "      <td>810</td>\n",
       "      <td>0.209440</td>\n",
       "      <td>2.967060</td>\n",
       "      <td>3216.8</td>\n",
       "      <td>2895.8</td>\n",
       "      <td>3057.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>303718</th>\n",
       "      <td>2317</td>\n",
       "      <td>318</td>\n",
       "      <td>26</td>\n",
       "      <td>30</td>\n",
       "      <td>12</td>\n",
       "      <td>860</td>\n",
       "      <td>144</td>\n",
       "      <td>207</td>\n",
       "      <td>198</td>\n",
       "      <td>849</td>\n",
       "      <td>...</td>\n",
       "      <td>890</td>\n",
       "      <td>1709</td>\n",
       "      <td>830</td>\n",
       "      <td>819</td>\n",
       "      <td>11</td>\n",
       "      <td>0.453786</td>\n",
       "      <td>5.550147</td>\n",
       "      <td>2311.0</td>\n",
       "      <td>2147.2</td>\n",
       "      <td>2145.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>556659</th>\n",
       "      <td>3357</td>\n",
       "      <td>104</td>\n",
       "      <td>23</td>\n",
       "      <td>180</td>\n",
       "      <td>8</td>\n",
       "      <td>2259</td>\n",
       "      <td>251</td>\n",
       "      <td>203</td>\n",
       "      <td>66</td>\n",
       "      <td>4542</td>\n",
       "      <td>...</td>\n",
       "      <td>2439</td>\n",
       "      <td>6801</td>\n",
       "      <td>2079</td>\n",
       "      <td>4362</td>\n",
       "      <td>2283</td>\n",
       "      <td>0.401426</td>\n",
       "      <td>1.815142</td>\n",
       "      <td>3321.0</td>\n",
       "      <td>2448.6</td>\n",
       "      <td>2905.2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 66 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Elevation(meters)  Aspect(degrees)  Slope(degrees)  \\\n",
       "80378                2893               27              17   \n",
       "344116               3367              259              21   \n",
       "407990               3323              170              12   \n",
       "303718               2317              318              26   \n",
       "556659               3357              104              23   \n",
       "\n",
       "        Horizontal_Distance_To_Hydrology(meters)  \\\n",
       "80378                                         42   \n",
       "344116                                       201   \n",
       "407990                                       531   \n",
       "303718                                        30   \n",
       "556659                                       180   \n",
       "\n",
       "        Vertical_Distance_To_Hydrology(meters)  \\\n",
       "80378                                      -10   \n",
       "344116                                      57   \n",
       "407990                                     103   \n",
       "303718                                      12   \n",
       "556659                                       8   \n",
       "\n",
       "        Horizontal_Distance_To_Roadways(meters)  Hillshade_9am  \\\n",
       "80378                                      4377            211   \n",
       "344116                                     3792            164   \n",
       "407990                                     1326            229   \n",
       "303718                                      860            144   \n",
       "556659                                     2259            251   \n",
       "\n",
       "        Hillshade_Noon  Hillshade_3pm  \\\n",
       "80378              202            123   \n",
       "344116             246            218   \n",
       "407990             245            147   \n",
       "303718             207            198   \n",
       "556659             203             66   \n",
       "\n",
       "        Horizontal_Distance_To_Fire_Points(meters)  ...  Hydro_Road  \\\n",
       "80378                                         3914  ...        4419   \n",
       "344116                                         902  ...        3993   \n",
       "407990                                        2136  ...        1857   \n",
       "303718                                         849  ...         890   \n",
       "556659                                        4542  ...        2439   \n",
       "\n",
       "        Road_fire  Hydro_Road_sub  Hydro_fire_sub  Road_fire_sub  RAD_SLOPE  \\\n",
       "80378        8291            4335            3872            463   0.296706   \n",
       "344116       4694            3591             701           2890   0.366519   \n",
       "407990       3462             795            1605            810   0.209440   \n",
       "303718       1709             830             819             11   0.453786   \n",
       "556659       6801            2079            4362           2283   0.401426   \n",
       "\n",
       "        RAD_Aspect  EL_DIS  EL_Fire  EL_Road  \n",
       "80378     0.471239  2884.6   2110.2   2017.6  \n",
       "344116    4.520403  3326.8   3186.6   2608.6  \n",
       "407990    2.967060  3216.8   2895.8   3057.8  \n",
       "303718    5.550147  2311.0   2147.2   2145.0  \n",
       "556659    1.815142  3321.0   2448.6   2905.2  \n",
       "\n",
       "[5 rows x 66 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the frame loaded from the pickle.\n",
    "pseudo_train_final.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Also, CatBoost gives different predictions on different GPUs because NVIDIA GPUs use single-precision values, so please take care of that.\n",
    "# Running the above pickled file on Kaggle should reproduce a score of\n",
    "# 0.2014-0.2017"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CatBoost model for the final predictions; `catlist` holds the positions of the columns\n",
    "# to be treated as categorical.\n",
    "cb = CatBoostClassifier(random_state=27,task_type=\"GPU\", devices=\"0:1\",\n",
    "                        cat_features = catlist,verbose=200,n_estimators=2800)\n",
    "\n",
    "# The notebook never fitted `cb`, so the predict_proba call further down could not run.\n",
    "# NOTE(review): assumes pseudo_train_final holds the features plus a 'Cover_Type' target\n",
    "# column, in the column order `catlist` was computed on -- confirm against the pickle.\n",
    "cb.fit(pseudo_train_final.drop('Cover_Type',axis=1), pseudo_train_final['Cover_Type'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `preds` is only assigned in the following cell, so on a fresh top-to-bottom\n",
    "# run this cell raises NameError; `sub` is also rebuilt in that cell.\n",
    "# The label order [7,5,6,4,2,3,1] presumably comes from an earlier experiment -- confirm\n",
    "# whether this cell is still needed.\n",
    "# Label the probability columns, then reorder them to 1..7.\n",
    "sub = pd.DataFrame(preds,columns=[7,5,6,4,2,3,1])\n",
    "sub = sub[[1,2,3,4,5,6,7]]\n",
    "sub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict_proba takes no `axis` argument, so that keyword was dropped. Features are picked by\n",
    "# name from the model's own feature list, so the column order matches training and any extra\n",
    "# column on df_test (e.g. pseudo-labels) is left out.\n",
    "preds = cb.predict_proba(df_test[cb.feature_names_])\n",
    "# Label the probability columns from the fitted model's classes rather than a fixed 1..7.\n",
    "# NOTE(review): int() assumes the class labels are numeric -- confirm.\n",
    "sub = pd.DataFrame(preds,columns=[int(label) for label in cb.classes_])\n",
    "sub"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Also, pseudo labelling is the only way of achieving a score of 0.20xx; it is very difficult for any solution without it to achieve this score."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
